Read in Data

country <- read_csv("data/Country.csv", col_types = cols())
ind <- read_csv("data/Indicators.csv", col_types = cols())

First check-in

Focus on Mexico

mex <- ind %>% filter(CountryName == "Mexico")

mex_ <- mex %>% group_by(IndicatorCode) %>% 
  summarise(
    n = n(),
    CountryName,
    Year,
    IndicatorCode,
    IndicatorName = str_replace_all(IndicatorName, " ", "_"),
    Value
  ) %>% filter(n > 50) %>% ungroup()
## `summarise()` has grouped output by 'IndicatorCode'. You can override using the `.groups` argument.

Focus on United States

mex <- ind %>% filter(CountryName == "United States")
mex_ <- mex %>% group_by(IndicatorCode) %>% 
  summarise(
    n = n(),
    CountryName,
    Year,
    IndicatorCode,
    IndicatorName = str_replace_all(IndicatorName, " ", "_"),
    Value
  ) %>% filter(n > 50) %>% ungroup()
## `summarise()` has grouped output by 'IndicatorCode'. You can override using the `.groups` argument.
mex2 <- mex_ %>% select(CountryName, Year, IndicatorCode, Value) %>%
pivot_wider(
  names_from = IndicatorCode,
  values_from = Value,
  values_fill = 0
)
mex2

Correlation Matrix

# mex3 <- mex2 %>% select(-c(CountryName, Year, IndicatorCode)) %>% 
mex3 <- mex2 %>% select(-c(CountryName, Year)) %>% 
  cor() %>% 
  round(digits = 2) %>% 
  data.frame()
mex3
code_to_name <- function(code) {
  uh <- ind %>% select(IndicatorName, IndicatorCode) %>% filter(IndicatorCode == code) %>% unique()
  uh$IndicatorName
}

# code_to_name("AG.LND.ARBL.HA.PC")
# code_to_name("EN.URB.MCTY")
# code_to_name("NE.IMP.GNFS.ZS")
# code_to_name("NY.GSR.NFCY.CD")
# code_to_name("NE.TRD.GNFS.ZS")
# code_to_name("SP.POP.TOTL")
# code_to_name("TG.VAL.TOTL.GD.ZS")
# code_to_name("NY.TAX.NIND.CN")
# code_to_name("EN.POP.DNST")
# code_to_name("FI.RES.TOTL.CD")
# code_to_name("SP.POP.65UP.TO.ZS")
# code_to_name("NY.GDP.DEFL.ZS")
# code_to_name("NY.GDP.MKTP.KN")
# code_to_name("NY.GNP.MKTP.CN")

v <- c("AG.LND.ARBL.HA.PC", "EN.URB.MCTY", "NE.IMP.GNFS.ZS", "NY.GSR.NFCY.CD", "NE.TRD.GNFS.ZS", "SP.POP.TOTL", "TG.VAL.TOTL.GD.ZS", "NY.TAX.NIND.CN", "EN.POP.DNST", "FI.RES.TOTL.CD", "SP.POP.65UP.TO.ZS")

Find highly correlated with GDP

mex3['NY.GDP.PCAP.CD'] %>% filter(abs(NY.GDP.PCAP.CD) < 0.95 & abs(NY.GDP.PCAP.CD) > 0.3) %>% arrange(-abs(NY.GDP.PCAP.CD))
# mex3['GDP_per_capita_(current_US$)    '] %>% arrange(GDP_per_capita_(current_US$))
# mex3['NY.GDP.PCAP.CD'] %>% arrange(NY.GDP.PCAP.CD)

mex3['NY.GDP.PCAP.CD'] %>% filter(abs(NY.GDP.PCAP.CD) > 0.95 & abs(NY.GDP.PCAP.CD) < 1) %>% arrange(NY.GDP.PCAP.CD)
# mex3['Land_area_(sq._km)'] %>% filter(abs(NY.GDP.PCAP.CD) > 0.9) %>% arrange(NY.GDP.PCAP.CD)

mex2_ <- mex2 %>% select(-c(CountryName, Year))

cor(mex2_) %>% data.frame()

Correlation Matrix

mex2_ <- mex2_ %>% select(AG.LND.ARBL.HA.PC,
                          EN.URB.MCTY,
                          NE.IMP.GNFS.ZS,
                          NY.GSR.NFCY.CD,
                          NE.TRD.GNFS.ZS,
                          SP.POP.TOTL,
                          TG.VAL.TOTL.GD.ZS,
                          # NY.TAX.NIND.CN,
                          EN.POP.DNST,
                          FI.RES.TOTL.CD,
                          SP.POP.65UP.TO.ZS)

# corrplot.mixed(
#   round(cor(mex2_), 2),
#   lower = "number",
#   upper = "color"
# )

cor(mex2_) %>% corrplot(diag = FALSE, order = 'AOE')

# code_to_name("AG.LND.ARBL.HA.PC")
# code_to_name("EN.URB.MCTY")
# code_to_name("NE.IMP.GNFS.ZS")
# code_to_name("NY.GSR.NFCY.CD")
# code_to_name("NE.TRD.GNFS.ZS")
# code_to_name("SP.POP.TOTL")
# code_to_name("TG.VAL.TOTL.GD.ZS")
# code_to_name("NY.TAX.NIND.CN")
# code_to_name("EN.POP.DNST")
# code_to_name("FI.RES.TOTL.CD")
# code_to_name("SP.POP.65UP.TO.ZS")

# corrplot.mixed(
#   round(cor(mex2_), 2),
#   lower = "number",
#   upper = "color"
# )
Code Name
AG.LND.ARBL.HA.PC Arable land (hectares per person)
EN.URB.MCTY Population in urban agglomerations of more than 1 million
NE.IMP.GNFS.ZS Imports of goods and services (% of GDP)
NY.GSR.NFCY.CD Net income from abroad (current US\() | | NE.TRD.GNFS.ZS | Trade (% of GDP) | | SP.POP.TOTL | Population, total | | TG.VAL.TOTL.GD.ZS | Merchandise trade (% of GDP) | | NY.TAX.NIND.CN | Net taxes on products (current LCU) | | EN.POP.DNST | Population density (people per sq. km of land area) | | FI.RES.TOTL.CD | Total reserves (includes gold, current US\))
SP.POP.65UP.TO.ZS Population ages 65 and above (% of total)

Data sparsity exploration

Testing Number of Values

# ind %>% group_by(CountryName) %>% 
#   summarise(
#     n_ind_many_year = group_by(IndicatorName) %>% 
#       summarise(
#         
#       )
#   )

ind %>% group_by(CountryName) %>% 
  summarise(
    n = n(),
    n_year = Year %>% unique() %>% length(),
    n_ind = IndicatorName %>% unique() %>% length()
  ) %>% arrange(n, n_ind)
ind %>% group_by(CountryName) %>% 
  summarise(
    n = n(),
    n_year = Year %>% unique() %>% length(),
    n_ind = IndicatorName %>% unique() %>% length()
  ) %>% arrange(CountryName)

What Indicators are in ALL conutries?

# what indicators are in ALL countries?
foo <- ind %>% group_by(IndicatorName) %>% 
  summarise(
    n = n(),
    n_count = CountryName %>% unique() %>% length()
  ) %>% arrange(-n_count) %>% 
  select(n_count, n, IndicatorName)
foo
foo %>% nrow()
## [1] 1344
ind %>% group_by(IndicatorName) %>% 
  summarise(
    n_val = Value %>% length()
  ) %>% arrange(-n_val)
foo %>% filter( n_count > 200)
ind
ind %>%
  filter(CountryName=="United States" & IndicatorCode=="SP.ADO.TFRT") %>%
  ggplot(aes(x=Year, y=Value)) +
  geom_point()